Jiayi Xu
This data set contains booking information for a city hotel and a resort hotel, and includes information such as when the booking was made, length of stay, the number of adults, children, and/or babies, and the number of available parking spaces, among other things.
From the publication we know that both hotels are located in Portugal (southern Europe) ("H1 at the resort region of Algarve and H2 at the city of Lisbon"). The distance between these two locations is ca. 280 km by car and both locations border on the north atlantic. The data contains "bookings due to arrive between the 1st of July of 2015 and the 31st of August 2017".
To predict the probability that a hotel booking will be canceled, using features such as previous_cancellations.
Use Logistic Regression, Decision Tree, Random Forest, Gradient Boosting and K-Nearest Neighbors modeling methods.
# Load libraries
# general data manipulation
import pandas as pd
import numpy as np
# general visualization
import seaborn as sns
import plotly
from plotly import __version__
from plotly.offline import download_plotlyjs, plot, iplot
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
plotly.offline.init_notebook_mode()
# forecast + modeling
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import scale
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
# Load data
# NOTE(review): hard-coded absolute path to a local copy of the Kaggle
# "hotel_bookings.csv" dataset — adjust before running elsewhere.
df = pd.read_csv('/Users/xujiayi/Desktop/DS面试题/Week2/hotel_bookings.csv')
# Number of rows/columns
df.shape #The dataset initially contains 119,390 rows (hotel booking information) and 32 columns (‘variables’ or ‘features’ of each booking information).
df.head()
1.Hotel: (H1 = Resort Hotel or H2 = City Hotel)
2.is_canceled: Value indicating if the booking was canceled (1) or not (0)
3.lead_time: Number of days that elapsed between the entering date of the booking into the PMS and the arrival date
4.arrival_date_year:Year of arrival date
5.arrival_date_month: Month of arrival date
6.arrival_date_week_number:Week number of year for arrival date
7.arrival_date_day_of_month:Day of arrival date
8.stays_in_weekend_nights:Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel
9.stays_in_week_nights: Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel
10.adults:Number of adults
11.children:Number of children
12.meal:Type of meal booked. Categories are presented in standard hospitality meal packages: Undefined/SC – no meal package; BB – Bed & Breakfast; HB – Half board (breakfast and one other meal – usually dinner); FB – Full board (breakfast, lunch and dinner)
13.country:Country of origin
14.market_segment:Market segment designation. In categories, the term “TA” means “Travel Agents” and “TO” means “Tour Operators”
15.distribution_channel:Booking distribution channel. The term “TA” means “Travel Agents” and “TO” means “Tour Operators”
16.is_repeated_guest: Value indicating if the booking name was from a repeated guest (1) or not (0)
17.previous_cancellations: Number of previous bookings that were cancelled by the customer prior to the current booking
18.previous_bookings_not_canceled:Number of previous bookings not cancelled by the customer prior to the current booking
19.reserved_room_type: Code of room type reserved. Code is presented instead of designation for anonymity reasons.
20.assigned_room_type: Code for the type of room assigned to the booking. Sometimes the assigned room type differs from the reserved room type due to hotel operation reasons (e.g. overbooking) or by customer request. Code is presented instead of designation for anonymity reasons.
21.booking_changes: Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation
22.deposit_type: Indication on if the customer made a deposit to guarantee the booking. This variable can assume three categories: No Deposit – no deposit was made; Non Refund – a deposit was made in the value of the total stay cost; Refundable – a deposit was made with a value under the total cost of stay.
23.agent: ID of the travel agency that made the booking
24.company: ID of the company/entity that made the booking or responsible for paying the booking. ID is presented instead of designation for anonymity reasons
25.days_in_waiting_list:Number of days the booking was in the waiting list before it was confirmed to the customer
26.customer_type: Type of booking, assuming one of four categories: Contract - when the booking has an allotment or other type of contract associated to it; Group – when the booking is associated to a group; Transient – when the booking is not part of a group or contract, and is not associated to other transient booking; Transient-party – when the booking is transient, but is associated to at least other transient booking
27.adr: Average Daily Rate as defined by dividing the sum of all lodging transactions by the total number of staying nights
28.required_car_parking_spaces :Number of car parking spaces required by the customer
29.total_of_special_requests:Number of special requests made by the customer (e.g. twin bed or high floor)
30.reservation_status: Reservation last status, assuming one of three categories: Canceled – booking was canceled by the customer; Check-Out – customer has checked in but already departed; No-Show – customer did not check-in and did not inform the hotel of the reason why
31.reservation_status_date:Date at which the last status was set. This variable can be used in conjunction with the ReservationStatus to understand when was the booking canceled or when did the customer checked-out of the hotel
32.babies: Number of babies
# missing value inspection
df.isna().sum()
# 4 missing values in the children column -> treat as "no children"
df["children"] = df["children"].fillna(0)
# 488 missing values in country -> fill with a placeholder category
# (presumably "Blan" was meant to be "Blank"; kept as-is so later plots match)
df["country"] = df['country'].fillna("Blan")
# Descriptive summary — look at the target variable per hotel
print(df.groupby('hotel')['is_canceled'].value_counts())
# Booking cancellation summary: counts per hotel
ch = df.loc[df["hotel"] == "City Hotel"].shape[0]
rh = df.loc[df["hotel"] == "Resort Hotel"].shape[0]
# share of each hotel in all bookings (percent)
ch_rate = (df.loc[df["hotel"] == "City Hotel"].shape[0])/df.shape[0]*100
rh_rate = (df.loc[df["hotel"] == "Resort Hotel"].shape[0])/df.shape[0]*100
# is_canceled is 0/1, so its sum is the number of cancelled bookings
total_cancellations = df["is_canceled"].sum()
rh_cancellations = df.loc[df["hotel"] == "Resort Hotel"]["is_canceled"].sum()
ch_cancellations = df.loc[df["hotel"] == "City Hotel"]["is_canceled"].sum()
# Cancellation rate per hotel category
rate_cancel = total_cancellations / df.shape[0] * 100
rh_rate_cancel = rh_cancellations / df.loc[df["hotel"] == "Resort Hotel"].shape[0] * 100
ch_rate_cancel = ch_cancellations / df.loc[df["hotel"] == "City Hotel"].shape[0] * 100
print("")
print(f"Total City Hotel bookings: {ch:,}({ch_rate:.2f} %)")
print(f"Total Resort Hotel bookings: {rh:,}({rh_rate:.2f} %)")
print(f"Total bookings canceled: {total_cancellations:}({rate_cancel:.2f} %) ")
print(f"Resort hotel bookings canceled: {rh_cancellations:,} ({rh_rate_cancel:.2f}%)")
print(f"City hotel bookings canceled: {ch_cancellations:,} ({ch_rate_cancel:.2f} %)")
# Dataset summary statistics by hotel
city = df.loc[df["hotel"] == "City Hotel"]
resort = df.loc[df["hotel"] == "Resort Hotel"]
city.describe()
resort.describe()
Before diving into the univariate analysis, attributes have been roughly divided into four groups: demographic related, time-related(seasonality), booking behaviors and stay behaviors.
# Inspect the raw country codes
df['country'].unique()
# "CN" and "CHN" both represent China — merge them into "CHN".
df.loc[df.country == 'CN','country'] = 'CHN'
# Which nationality is most likely to cancel the booking
# Data cleaning: count cancelled bookings per country
country_data = pd.DataFrame(df.loc[df["is_canceled"] == 1]["country"].value_counts())
# NOTE(review): pandas >= 2.0 names the value_counts column "count", which
# would make this rename a silent no-op — confirm pandas version.
country_data.rename(columns = {"country": "Number of Guests"}, inplace=True)
country_data["Country"] = country_data.index
total_guests = country_data["Number of Guests"].sum()
country_data["Guests in %"] = round(country_data["Number of Guests"] / total_guests * 100, 3)
# Show the Guest country distribution piechart
country_fig = px.pie(country_data, values='Number of Guests', names='Country',
                     title='Cancellation nationality distribution piechart',
                     hover_data=['Guests in %'], labels={'Guests in %':'Guests in %'})
country_fig.update_traces(textposition='inside', textinfo='percent+label')
#country_fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
country_fig.show()
# Where did the guests come from? (non-cancelled bookings only)
# Data cleaning: count kept bookings per country
country_data = pd.DataFrame(df.loc[df["is_canceled"] == 0]["country"].value_counts())
# NOTE(review): pandas >= 2.0 names the value_counts column "count", which
# would make this rename a silent no-op — confirm pandas version.
country_data.rename(columns = {"country": "Number of Guests"}, inplace=True)
country_data["Country"] = country_data.index
total_guests = country_data["Number of Guests"].sum()
country_data["Guests in %"] = round(country_data["Number of Guests"] / total_guests * 100, 3)
# Show the Guest nationality distribution piechart
country_fig = px.pie(country_data, values='Number of Guests', names='Country',
                     title='Guests nationality distribution piechart',
                     hover_data=['Guests in %'], labels={'Guests in %':'Guests in %'})
country_fig.update_traces(textposition='inside', textinfo='percent+label')
#country_fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
country_fig.show()
# country of non-cancelled guests, per hotel
c = city.loc[city['is_canceled'] == 0].country
r = resort.loc[resort['is_canceled'] == 0].country
# city hotel: top-7 nationalities by count
plt.figure(figsize = (10,3))
sns.countplot(y = c, data = city, palette = 'Paired',
              order = c.value_counts().iloc[:7].index)
plt.xticks(np.arange(0, 13000, 2000))
plt.title('Top 7 City Hotel guest nationality')
plt.show()
# resort hotel: top-7 nationalities by count
plt.figure(figsize = (10, 3))
sns.countplot(y = r, data = resort, palette = 'Paired',
              order = r.value_counts().iloc[:7].index)
plt.xticks(np.arange(0, 13000, 2000))
plt.title('Top 7 Resort Hotel guest nationality')
plt.show()
# 3x2 grid of guest-count boxplots: by hotel (left) and by outcome (right)
fig = plt.figure(figsize=(15,18))
# Is 55 adults an outlier, or were they coming to a convention? By company?
ax1 = plt.subplot2grid((3,2),(0,0))
sns.boxplot(x="hotel", y="adults", data=df, palette="Set3")
plt.title('BoxPlot by adults',fontsize = 15, weight = 'bold')
ax1 = plt.subplot2grid((3,2),(0,1))
sns.boxplot(x='is_canceled', y="adults", data=df, palette="Set3")
plt.title('Adults BoxPlot by canceled', fontsize = 15, weight = 'bold' )
# 10 children? Outlier? Were they having a birthday party?
ax1 = plt.subplot2grid((3,2),(1,0))
sns.boxplot(x="hotel", y="children", data=df, palette="Set3")
plt.title('BoxPlot by children', fontsize = 15, weight = 'bold' )
ax1 = plt.subplot2grid((3,2),(1,1))
sns.boxplot(x='is_canceled', y="children", data=df, palette="Set3")
plt.title('Children BoxPlot by canceled', fontsize = 15, weight = 'bold' )
# 10 babies? Outlier? Wrong input?
ax1 = plt.subplot2grid((3,2),(2,0))
sns.boxplot(x="hotel", y="babies", data=df, palette="Set3")
plt.title('BoxPlot by babies', fontsize = 15, weight = 'bold' )
ax1 = plt.subplot2grid((3,2),(2,1))
sns.boxplot(x='is_canceled', y="babies", data=df, palette="Set3")
plt.title('Babies BoxPlot by canceled', fontsize = 15, weight = 'bold' )
fig = plt.figure()
fig = plt.figure(figsize=(15,18))
# Cancellation counts broken down by customer type
ax1 = plt.subplot2grid((3,2),(0,0))
plt.title("Cancellation distribution by Customer type ", fontdict = {'fontsize': 16})
ax = sns.countplot(x = "customer_type", hue = 'is_canceled', data = df)
# annotate each bar with its count
for p in ax.patches:
    ax.annotate((p.get_height()),(p.get_x()+0.4 , p.get_height()+100))
# Did repeated customers cancel the booking?
ax1 = plt.subplot2grid((3,2),(0,1))
plt.title("Cancellation distribution by Repeated guest", fontdict = {'fontsize': 16})
ax = sns.countplot(x = "is_canceled", hue = 'is_repeated_guest', data = df)
for p in ax.patches:
    ax.annotate((p.get_height()),(p.get_x()+0.4 , p.get_height()+100))
# lead time analysis: kernel-density of lead_time split by cancellation outcome
g = (sns.FacetGrid(df, hue = 'is_canceled',
                   height = 6,
                   xlim = (0,500))
     .map(sns.kdeplot, 'lead_time', shade = True)
     .add_legend());
g.fig.suptitle('Density Curve of Lead Time by Cancelation', fontsize=16)
# Approach to the seasonal booking: month-of-arrival histogram by outcome
a = df.loc[df['is_canceled'] == 0].arrival_date_month
b = df.loc[df['is_canceled'] == 1].arrival_date_month
n_bins = 12
fig, axe = plt.subplots(figsize=(12, 6))
colors = ['skyblue', 'darkblue']
# side-by-side bars per month: kept (0) vs canceled (1)
axe.hist([a, b], n_bins, histtype='bar', color=colors, label=['0', '1'])
axe.set_xlabel('arrival_date_month')
axe.legend()
plt.title("Seasonal booking Cancellation histogram")
fig.tight_layout()
plt.show()
# Cancelled bookings only; .copy() avoids SettingWithCopyWarning on the
# 'Date' assignment below.
df_can = df.loc[df["is_canceled"] == 1].copy()
# Build an arrival Date from year / month-name / day (e.g. "2015-July-1").
df_can['Date'] = pd.to_datetime(df_can[['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']].astype(str).agg('-'.join, axis=1))
# City hotel & resort hotel cancellation guests visualization
chdf_can = df_can.loc[df_can["hotel"] == "City Hotel"]
# FIX: the boolean mask must be built on df_can, not df — a mask indexed on
# the full frame is unalignable with the filtered frame.
rhdf_can = df_can.loc[df_can["hotel"] == "Resort Hotel"]
# daily cancellation counts per hotel
chdf_can_guests_daily = chdf_can.groupby("Date")["hotel"].count()
rhdf_can_guests_daily = rhdf_can.groupby("Date")["hotel"].count()
city_guest_data = pd.DataFrame({"Date": list(chdf_can_guests_daily.index),
                                "hotel": "City hotel",
                                "canceled guests": list(chdf_can_guests_daily.values)})
resort_guest_data = pd.DataFrame({"Date": list(rhdf_can_guests_daily.index),
                                  "hotel": "Resort hotel",
                                  "canceled guests": list(rhdf_can_guests_daily.values)})
# Time Series line chart of city hotel guests cancellation
fig = px.line(city_guest_data, x="Date", y="canceled guests", title='Time Series line chart of city hotel guests cancellation')
fig.update_xaxes(rangeslider_visible=True)
fig.show()
# Time Series line chart of Resort hotel guests cancellation
fig = px.line(resort_guest_data, x="Date", y="canceled guests", title='Time Series line chart of resort hotel guests cancellation')
fig.update_xaxes(rangeslider_visible=True)
fig.show()
# deposit_type impact on cancellation:
deposit_cancel_data = df.groupby("deposit_type")["is_canceled"].describe()
# Price
# normalize price per night (adr) to a per-person rate
# NOTE(review): rows with adults + children == 0 produce inf here — confirm
# whether such rows exist and should be filtered first.
df["adr_pp"] = df["adr"] / (df["adults"] + df["children"])
df_guests = df.loc[df["is_canceled"] == 0] # only actual guests
room_prices = df_guests[["hotel", "reserved_room_type", "adr_pp"]].sort_values("reserved_room_type")
#show figure: mean cancellation rate (in %) per deposit type
fig = plt.figure(figsize=(15,18))
ax1 = plt.subplot2grid((3,2),(0,0))
sns.barplot(x=deposit_cancel_data.index, y=deposit_cancel_data["mean"] * 100,palette = 'Oranges')
plt.title("Effect of deposit_type on cancellation", fontsize=16)
plt.ylabel("Cancellations [%]", fontsize=16)
plt.show()
# price boxplot: per-person nightly rate by room type and hotel
plt.figure(figsize=(15, 8))
sns.boxplot(x = "reserved_room_type",
            y = "adr_pp",
            hue = "hotel",
            data = room_prices,
            hue_order = ["City Hotel", "Resort Hotel"],
            fliersize=0)
plt.title("Price boxplot of room types per night and person", fontsize=16)
plt.xlabel("Room type", fontsize=16)
plt.ylabel("Price [EUR]", fontsize=16)
plt.legend(loc="upper right")
plt.ylim(0, 160)
plt.show()
# Barplot: booking counts per market segment, split by cancellation outcome
plt.figure(figsize = (15,8))
plt.title("Barplot Distributon of Segments by Cancellation", fontdict = {'fontsize':16})
ax = sns.countplot(x = "market_segment", hue = 'is_canceled', data = df)
# annotate each bar with its count
for p in ax.patches:
    ax.annotate((p.get_height()),(p.get_x()+0.4 , p.get_height()+100))
# Cancellation-rate summaries grouped by two booking-behaviour features
preca = df.groupby('previous_cancellations')["is_canceled"].describe()
dayswai = df.groupby("days_in_waiting_list")["is_canceled"].describe()
# Lineplot: cancellation rate (in %) vs previous_cancellations
fig = plt.figure(figsize=(15,16))
ax1 = plt.subplot2grid((3,2),(0,0))
sns.lineplot(x=preca.index, y=preca["mean"] * 100,color ="Blue")
# (FIX: removed a copy-pasted `for p in ax.patches` annotation loop — `ax`
# still pointed at the previous figure's countplot, and a lineplot has no
# bar patches to annotate anyway)
plt.title("Effect of previous_cancellations on cancellation", fontsize=16)
plt.ylabel("Cancellations [%]", fontsize=16)
plt.show()
# Barplot: cancellation rate vs days_in_waiting_list
plt.figure(figsize=(50, 16))
sns.barplot(x=dayswai.index, y=dayswai["mean"] ,color ="Green")
plt.title("Effect of days_in_waiting_list on cancellation", fontsize=16)
plt.ylabel("Cancellations [%]", fontsize=16)
plt.show()
# stays in weekend nights (actual guests only)
plt.figure(figsize=(12,6))
ax = sns.countplot(x="stays_in_weekend_nights", data = df_guests, palette="winter_r")
plt.title('Total Weekend Nights Stayed')
plt.xlabel('total_nights_stayed')
plt.ylabel('Total Count')
# annotate each bar with its count
for p in ax.patches:
    ax.annotate((p.get_height()),(p.get_x()-0.1 , p.get_height()+100))
# stays in week nights (actual guests only)
plt.figure(figsize=(12,6))
ax = sns.countplot(x="stays_in_week_nights", data = df_guests, palette="summer_r")
plt.title('Total Week Nights Stayed')
plt.xlabel('total_nights_stayed')
plt.ylabel('Total Count')
for p in ax.patches:
    ax.annotate((p.get_height()),(p.get_x()-0.1 , p.get_height()+100))
# total length of stay = weekend nights + week nights
df['total_nights_stayed'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
plt.figure(figsize = (12,6))
plt.title("Barplot Distributon of total_nights_stayed by Cancellation", fontdict = {'fontsize':16})
ax = sns.countplot(x = "total_nights_stayed", hue = 'is_canceled', data = df)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=15)
for p in ax.patches:
    ax.annotate((p.get_height()),(p.get_x()+0.4 , p.get_height()+100))
# Meal type vs cancellation outcome
plt.figure(figsize = (12,6))
plt.title("Barplot Distributon of Meals by Cancellation", fontdict = {'fontsize':16})
ax = sns.countplot(x = "meal", hue = 'is_canceled', data = df)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=15)
for p in ax.patches:
    ax.annotate((p.get_height()),(p.get_x()+0.4 , p.get_height()+100))
# correlation analysis
# NOTE(review): pandas >= 2.0 raises on non-numeric columns here (needs
# numeric_only=True); older versions silently selected numeric columns —
# confirm pandas version.
corr_matrix = df.corr()
corr_matrix["is_canceled"].sort_values(ascending=False)
# Take selected numerical data
df_numerical = df.loc[:, ['is_canceled', 'lead_time', 'arrival_date_week_number',
                          'arrival_date_day_of_month', 'total_nights_stayed','adults', 'children', 'babies',
                          'previous_cancellations', 'previous_bookings_not_canceled',
                          'booking_changes', 'days_in_waiting_list', 'adr',
                          'required_car_parking_spaces', 'total_of_special_requests']]
# Correlation heatmap — df[top_corr_features] is the same column selection
# as df_numerical, so the heatmap covers exactly those columns
corrmat = df_numerical.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
g=sns.heatmap(df[top_corr_features].corr(),annot=True,cmap="coolwarm")
# Binary target column (is_canceled is already 0/1 here; the explicit cast
# keeps this step reusable for targets that are not).
df['OUTPUT_LABEL'] = (df.is_canceled).astype('int')
# numerical feature columns
cols_num = [ 'lead_time', 'arrival_date_week_number', 'arrival_date_day_of_month',
             'total_nights_stayed','adults', 'children', 'babies',
             'previous_cancellations', 'previous_bookings_not_canceled',
             'booking_changes', 'days_in_waiting_list', 'adr',
             'required_car_parking_spaces', 'total_of_special_requests']
df[cols_num].isnull().sum()
# (FIX: removed a stray bare `hotel` expression left behind by the notebook —
# it raises NameError when this file runs as a script)
df.is_repeated_guest = df.is_repeated_guest.astype('category')
# categorical feature columns, one-hot encoded below
cols_cat = ['hotel', 'arrival_date_month', 'meal', 'market_segment', 'is_repeated_guest',
            'distribution_channel', 'reserved_room_type', 'deposit_type',
            'customer_type']
df[cols_cat].isnull().sum()
df_cat = pd.get_dummies(df[cols_cat])
df_cat.head()
df = pd.concat([df, df_cat], axis = 1)
cols_all_cat = list(df_cat.columns)
cols_all_cat
print('Total number of features:', len(cols_num + cols_all_cat))
print('Numerical Features:', len(cols_num))
print('Categorical Features:', len(cols_all_cat))
cols_input = cols_num + cols_all_cat
# FIX: .copy() so the astype assignment below does not trigger
# SettingWithCopyWarning on a slice of df.
df_data = df[cols_input + ['OUTPUT_LABEL']].copy()
df_data['children'] = (df_data.children).astype('int')
df_data.info()
# shuffle the samples
df_data = df_data.sample(n = len(df_data), random_state = 11)
df_data = df_data.reset_index(drop = True)
# Train, validation and test split (70% / 15% / 15%)
df_valid_test = df_data.sample(frac = 0.30,random_state = 11)
df_test = df_valid_test.sample(frac = 0.5, random_state = 11)
df_valid = df_valid_test.drop(df_test.index)
df_train = df_data.drop(df_valid_test.index)
X_train = df_train[cols_input].values
X_valid = df_valid[cols_input].values
X_test = df_test[cols_input].values
y_train = df_train['OUTPUT_LABEL'].values
y_valid = df_valid['OUTPUT_LABEL'].values
y_test = df_test['OUTPUT_LABEL'].values
print('Training shapes:', X_train.shape, y_train.shape)
print('Validation shapes:', X_valid.shape, y_valid.shape)
print('Test shapes:', X_test.shape, y_test.shape)
# Standardize features by removing the mean and scaling to unit variance.
# FIX: fit the scaler on the TRAINING set only. The original code refit on
# X_valid and then X_test, so all three transforms ended up using test-set
# statistics (data leakage into the model inputs).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_tf = scaler.transform(X_train)
X_valid_tf = scaler.transform(X_valid)
X_test_tf = scaler.transform(X_test)
def show_metrics(y_true, y_score):
    """Print and return binary-classification metrics.

    Parameters
    ----------
    y_true : array of 0/1 ground-truth labels.
    y_score : array of 0/1 predicted labels (hard predictions, not
        probabilities, despite the name — the products below rely on that).

    Returns
    -------
    tuple of (accuracy, specificity, recall, precision, f1, mcc).
    """
    # Confusion-matrix cells via element-wise products of the 0/1 vectors
    # True positive
    tp = np.sum(y_true * y_score)
    # False positive
    fp = np.sum((y_true == 0) * y_score)
    # True negative
    tn = np.sum((y_true==0) * (y_score==0))
    # False negative
    fn = np.sum(y_true * (y_score==0))
    # Accuracy
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    # True positive rate (sensitivity or recall)
    tpr = tp / (tp + fn)
    # False positive rate (fall-out)
    fpr = fp / (fp + tn)
    # Precision
    precision = tp / (tp + fp)
    # True negative rate (specificity)
    tnr = 1 - fpr
    # F1 score
    f1 = 2*tp / (2*tp + fp + fn)
    # Matthews correlation coefficient.
    # FIX: cast counts to float first — the product of the four marginals
    # overflows int64 for sample sizes around 1e5 (this dataset's scale).
    tp_f, fp_f, tn_f, fn_f = float(tp), float(fp), float(tn), float(fn)
    mcc = (tp_f * tn_f - fp_f * fn_f) / np.sqrt((tp_f + fp_f) * (tp_f + fn_f) * (tn_f + fp_f) * (tn_f + fn_f))
    print("True positive: ", tp)
    print("False positive: ", fp)
    print("True negative: ", tn)
    print("False negative: ", fn)
    print("accuracy: ", accuracy)
    print("specificity: ", tnr)  # FIX: typo in the printed label
    print("recall: ", tpr)
    print("Precision: ", precision)
    print("F1: ", f1)
    print("MCC: ", mcc)
    return accuracy, tnr, tpr, precision, f1, mcc
# --- Logistic regression baseline (default hyper-parameters) ---
lr = LogisticRegression()
lr.fit(X_train_tf, y_train)
y_train_preds = lr.predict(X_train_tf)
y_valid_preds = lr.predict(X_valid_tf)
print('Logistic Regression')
print('Training')
# FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# int behaves identically here.
lr_train_accuracy, lr_train_specificity, lr_train_recall, lr_train_precision, lr_train_f1, lr_train_MCC,\
= show_metrics(y_train, y_train_preds.astype(int))
print('Validation')
lr_valid_accuracy, lr_valid_specificity, lr_valid_recall, lr_valid_precision, lr_valid_f1, lr_valid_MCC, \
= show_metrics(y_valid, y_valid_preds.astype(int))
# --- Decision tree baseline (default hyper-parameters) ---
tree = DecisionTreeClassifier()
tree.fit(X_train_tf, y_train)
y_train_preds = tree.predict(X_train_tf)
y_valid_preds = tree.predict(X_valid_tf)
print('Decision tree')
print('Training')
# FIX: np.int removed in NumPy 1.24 -> use builtin int.
tree_train_accuracy, tree_train_specificity, tree_train_recall, tree_train_precision, tree_train_f1, tree_train_MCC, \
= show_metrics(y_train, y_train_preds.astype(int))
print('Validation')
tree_valid_accuracy, tree_valid_specificity, tree_valid_recall, tree_valid_precision, tree_valid_f1, tree_valid_MCC, \
= show_metrics(y_valid, y_valid_preds.astype(int))
# --- Random forest baseline (default hyper-parameters) ---
rf = RandomForestClassifier()
rf.fit(X_train_tf, y_train)
y_train_preds = rf.predict(X_train_tf)
y_valid_preds = rf.predict(X_valid_tf)
print('Random forest')
print('Training')
# FIX: np.int removed in NumPy 1.24 -> use builtin int.
rf_train_accuracy, rf_train_specificity, rf_train_recall, rf_train_precision, rf_train_f1, rf_train_MCC,\
= show_metrics(y_train, y_train_preds.astype(int))
print('Validation')
rf_valid_accuracy, rf_valid_specificity, rf_valid_recall, rf_valid_precision, rf_valid_f1, rf_valid_MCC, \
= show_metrics(y_valid, y_valid_preds.astype(int))
# --- Gradient boosting baseline (default hyper-parameters) ---
gb = GradientBoostingClassifier()
gb.fit(X_train_tf, y_train)
y_train_preds = gb.predict(X_train_tf)
y_valid_preds = gb.predict(X_valid_tf)
print('Gradient boosting')
print('Training')
# FIX: np.int removed in NumPy 1.24 -> use builtin int.
gb_train_accuracy, gb_train_specificity,gb_train_recall,gb_train_precision, gb_train_f1, gb_train_MCC,\
= show_metrics(y_train, y_train_preds.astype(int))
print('Validation')
gb_valid_accuracy,gb_valid_specificity,gb_valid_recall, gb_valid_precision, gb_valid_f1, gb_valid_MCC, \
= show_metrics(y_valid, y_valid_preds.astype(int))
# --- k-nearest neighbours baseline (default hyper-parameters) ---
knn = KNeighborsClassifier()
knn.fit(X_train_tf, y_train)
y_train_preds = knn.predict(X_train_tf)
y_valid_preds = knn.predict(X_valid_tf)
print('kNN')
print('Training')
# FIX: np.int removed in NumPy 1.24 -> use builtin int.
knn_train_accuracy, knn_train_specificity,knn_train_recall,knn_train_precision, knn_train_f1, knn_train_MCC,\
= show_metrics(y_train, y_train_preds.astype(int))
# (FIX: removed a duplicated print('kNN') before the validation header)
print('Validation')
knn_valid_accuracy,knn_valid_specificity,knn_valid_recall, knn_valid_precision, knn_valid_f1, knn_valid_MCC, \
= show_metrics(y_valid, y_valid_preds.astype(int))
# Collect every metric into one long-format frame: two rows (train/valid)
# per classifier, one column per metric.
df_results = pd.DataFrame({'classifier': ['LR','LR','DT','DT','RF','RF','GB','GB','KNN','KNN'],
                           'data_set': ['train','valid'] * 5,
                           'accuracy': [lr_train_accuracy,lr_valid_accuracy,tree_train_accuracy,tree_valid_accuracy,rf_train_accuracy,rf_valid_accuracy,gb_train_accuracy,gb_valid_accuracy,knn_train_accuracy,knn_valid_accuracy,],
                           'precision': [lr_train_precision,lr_valid_precision,tree_train_precision,tree_valid_precision,rf_train_precision,rf_valid_precision,gb_train_precision,gb_valid_precision,knn_train_precision,knn_valid_precision,],
                           'recall': [lr_train_recall,lr_valid_recall,tree_train_recall,tree_valid_recall,rf_train_recall,rf_valid_recall,gb_train_recall,gb_valid_recall,knn_train_recall,knn_valid_recall,],
                           'f1': [lr_train_f1,lr_valid_f1,tree_train_f1,tree_valid_f1,rf_train_f1,rf_valid_f1,gb_train_f1,gb_valid_f1,knn_train_f1,knn_valid_f1,],
                           'specificity': [lr_train_specificity,lr_valid_specificity,tree_train_specificity,tree_valid_specificity,rf_train_specificity,rf_valid_specificity,gb_train_specificity,gb_valid_specificity,knn_train_specificity,knn_valid_specificity,],
                           'MCC':[lr_train_MCC,lr_valid_MCC,tree_train_MCC,tree_valid_MCC,rf_train_MCC,rf_valid_MCC,gb_train_MCC,gb_valid_MCC,knn_train_MCC,knn_valid_MCC,]})
sns.set(style="whitegrid")
# One train-vs-validation line chart per metric across the five classifiers.
ax1 = sns.lineplot(x="classifier", y="accuracy", hue="data_set", data=df_results)
ax1.set_xlabel('Classifier',fontsize=15)
ax1.set_ylabel('accuracy', fontsize=15)
ax1.tick_params(labelsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
ax = sns.lineplot(x="classifier", y="precision", hue="data_set", data=df_results)
ax.set_xlabel('Classifier',fontsize=15)
ax.set_ylabel('precision', fontsize=15)
ax.tick_params(labelsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
# FIX: this chart was an exact duplicate of the precision plot; f1 was
# computed but never plotted, so plot f1 here instead.
ax = sns.lineplot(x="classifier", y="f1", hue="data_set", data=df_results)
ax.set_xlabel('Classifier',fontsize=15)
ax.set_ylabel('f1', fontsize=15)
ax.tick_params(labelsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
ax = sns.lineplot(x="classifier", y="recall", hue="data_set", data=df_results)
ax.set_xlabel('Classifier',fontsize=15)
ax.set_ylabel('recall', fontsize=15)
ax.tick_params(labelsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
ax = sns.lineplot(x="classifier", y="specificity", hue="data_set", data=df_results)
ax.set_xlabel('Classifier',fontsize=15)
ax.set_ylabel('specificity', fontsize=15)
ax.tick_params(labelsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
ax = sns.lineplot(x="classifier", y="MCC", hue="data_set", data=df_results)
ax.set_xlabel('Classifier',fontsize=15)
ax.set_ylabel('MCC', fontsize=15)
ax.tick_params(labelsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
fig = plt.figure()
# Refit a logistic regression to read its coefficients as signed feature
# "importances" (positive coefficient -> pushes toward cancellation).
lr = LogisticRegression()
lr.fit(X_train_tf, y_train)
feature_importances = pd.DataFrame(lr.coef_[0], index = cols_input,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head()
# top-10 most positive coefficients, reversed so the largest sits on top
num = np.min([10, len(cols_input)])
ylocs = np.arange(num)
values_to_plot = feature_importances.iloc[:num].values.ravel()[::-1]
feature_labels = list(feature_importances.iloc[:num].index)[::-1]
plt.figure(num=None, figsize=(3, 5), dpi=80, facecolor='w', edgecolor='k');
plt.barh(ylocs, values_to_plot, align = 'center')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.title('Positive Feature Importance Score - Logistic Regression')
plt.yticks(ylocs, feature_labels)
plt.show()
# bottom-10 (most negative) coefficients
values_to_plot = feature_importances.iloc[-num:].values.ravel()
feature_labels = list(feature_importances.iloc[-num:].index)
plt.figure(num=None, figsize=(3, 5), dpi=80, facecolor='w', edgecolor='k');
plt.barh(ylocs, values_to_plot, align = 'center')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.title('Negative Feature Importance Score - Logistic Regression')
plt.yticks(ylocs, feature_labels)
plt.show()
# Refit a random forest and plot its impurity-based feature importances
rf = RandomForestClassifier()
rf.fit(X_train_tf, y_train)
feature_importances = pd.DataFrame(rf.feature_importances_, index = cols_input,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head()
# top-10 importances, reversed so the largest sits on top of the barh chart
num = np.min([10, len(cols_input)])
ylocs = np.arange(num)
values_to_plot = feature_importances.iloc[:num].values.ravel()[::-1]
feature_labels = list(feature_importances.iloc[:num].index)[::-1]
plt.figure(num=None, figsize=(3, 5), dpi=80, facecolor='w', edgecolor='k');
plt.barh(ylocs, values_to_plot, align = 'center')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.title('Feature Importance Score - Random Forest')
plt.yticks(ylocs, feature_labels)
plt.show()
# Refit a default random forest and list its tunable hyper-parameters
rf = RandomForestClassifier()
rf.fit(X_train_tf, y_train)
rf.get_params()
from sklearn.model_selection import RandomizedSearchCV
# Hyper-parameter search space for the random forest.
# number of trees
n_estimators = range(200,1200,200)
# maximum number of features to consider at each split
# FIX: 'auto' was just an alias of 'sqrt' for classifiers (so the original
# ['auto','sqrt'] searched one value twice) and 'auto' is removed in
# scikit-learn >= 1.3 — search 'sqrt' vs 'log2' instead.
max_features = ['sqrt','log2']
# maximum depth of the tree
max_depth = range(2,30,2)
# minimum number of samples to split a node
min_samples_split = range(2,10,2)
# criterion for evaluating a split
criterion = ['gini','entropy']
# random grid
random_grid = {'n_estimators':n_estimators,
               'max_features':max_features,
               'max_depth':max_depth,
               'min_samples_split':min_samples_split,
               'criterion':criterion}
print(random_grid)
from sklearn.metrics import make_scorer, roc_auc_score
# Score the randomized search by ROC AUC
auc_scoring = make_scorer(roc_auc_score)
rf = RandomForestClassifier()
# 20 random draws from the grid, 2-fold cross-validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 20, cv = 2,
                               scoring=auc_scoring, verbose = 1, random_state = 42)
rf_random.fit(X_train_tf, y_train)
rf_random.best_params_
# Baseline: a fresh untuned forest for comparison against the tuned one
rf = RandomForestClassifier()
rf.fit(X_train_tf, y_train)
# probability of the positive class (cancellation) for AUC
y_train_preds = rf.predict_proba(X_train_tf)[:,1]
y_valid_preds = rf.predict_proba(X_valid_tf)[:,1]
print('Baseline Random forest')
rf_train_base_auc = roc_auc_score(y_train, y_train_preds)
rf_valid_base_auc = roc_auc_score(y_valid, y_valid_preds)
print('Training AUC:%.3f'%(rf_train_base_auc))
print('Validation AUC:%.3f'%(rf_valid_base_auc))
print('Optimized Random forest')
y_train_preds_random = rf_random.best_estimator_.predict_proba(X_train_tf)[:,1]
y_valid_preds_random = rf_random.best_estimator_.predict_proba(X_valid_tf)[:,1]
rf_train_opt_auc = roc_auc_score(y_train, y_train_preds_random)
rf_valid_opt_auc = roc_auc_score(y_valid, y_valid_preds_random)
print('Training AUC:%.3f'%(rf_train_opt_auc))
print('Validation AUC:%.3f'%(rf_valid_opt_auc))
# Baseline gradient boosting model (default hyper-parameters)
gb = GradientBoostingClassifier()
gb.fit(X_train_tf, y_train)
# Hyper-parameter search space for gradient boosting.
# number of trees
n_estimators = range(30,330,10)
# maximum depth of the tree
max_depth = range(1,11,1)
# learning rate
learning_rate = [0.001,0.01,0.1]
# random grid
random_grid_gb = {'n_estimators':n_estimators,
                  'max_depth':max_depth,
                  'learning_rate':learning_rate}
# create the randomized search cross-validation
gb_random = RandomizedSearchCV(estimator = gb, param_distributions = random_grid_gb,
                               n_iter = 20, cv = 2, scoring = auc_scoring, verbose = 0,
                               random_state = 42)
gb_random.fit(X_train_tf, y_train)
gb_random.best_params_
# probability of the positive class (cancellation) for AUC
y_train_preds = gb.predict_proba(X_train_tf)[:,1]
y_valid_preds = gb.predict_proba(X_valid_tf)[:,1]
print('Baseline Gradient boosting')
gb_train_base_auc = roc_auc_score(y_train, y_train_preds)
gb_valid_base_auc = roc_auc_score(y_valid, y_valid_preds)
print('Training AUC:%.3f'%(gb_train_base_auc))
print('Validation AUC:%.3f'%(gb_valid_base_auc))
print('Optimized Gradient boosting')
# FIX: compute the tuned model's predictions BEFORE scoring them — the
# original scored the stale y_*_preds_random left over from the
# random-forest search, so the "optimized GB" AUCs were actually RF numbers.
y_train_preds_random = gb_random.best_estimator_.predict_proba(X_train_tf)[:,1]
y_valid_preds_random = gb_random.best_estimator_.predict_proba(X_valid_tf)[:,1]
gb_train_opt_auc = roc_auc_score(y_train, y_train_preds_random)
gb_valid_opt_auc = roc_auc_score(y_valid, y_valid_preds_random)
print('Training AUC:%.3f'%(gb_train_opt_auc))
print('Validation AUC:%.3f'%(gb_valid_opt_auc))
# Baseline vs optimized validation AUC for the two ensemble models
df_results = pd.DataFrame({'classifier':['RF','RF','GB','GB'],
                           'data_set':['baseline','optimized'] * 2,
                           'auc': [rf_valid_base_auc,rf_valid_opt_auc,
                                   gb_valid_base_auc,gb_valid_opt_auc],})
ax = sns.lineplot(x="classifier", y="auc", hue="data_set", data=df_results)
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('AUC', fontsize = 15)
ax.tick_params(labelsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize = 15)
plt.show()
import pickle
# Persist the best tuned model (protocol 4 for wide Python compatibility).
# NOTE(review): the file handles from open() are never closed — consider
# `with open(...)` blocks.
pickle.dump(gb_random.best_estimator_, open('best_classifier.pkl', 'wb'), protocol = 4)
best_model = pickle.load(open('best_classifier.pkl','rb'))
# positive-class probabilities (for AUC/ROC)
y_train_preds = best_model.predict_proba(X_train_tf)[:,1]
y_valid_preds = best_model.predict_proba(X_valid_tf)[:,1]
y_test_preds = best_model.predict_proba(X_test_tf)[:,1]
# hard 0/1 predictions (for the confusion-matrix metrics)
y_train_pred = best_model.predict(X_train_tf)
y_valid_pred = best_model.predict(X_valid_tf)
y_test_pred = best_model.predict(X_test_tf)
print('Training')
train_accuracy, train_specificity,train_recall,train_precision, train_f1, train_MCC,\
= show_metrics(y_train,y_train_pred)
print('Validation')
valid_accuracy, valid_specificity, valid_recall, valid_precision, valid_f1, valid_MCC, \
= show_metrics(y_valid,y_valid_pred)
print('Test')
test_accuracy, test_specificity, test_recall, test_precision, test_f1, test_MCC, \
= show_metrics(y_test,y_test_pred)
# AUC on all three splits from the probability scores
train_auc = roc_auc_score(y_train, y_train_preds)
valid_auc = roc_auc_score(y_valid, y_valid_preds)
test_auc = roc_auc_score(y_test, y_test_preds)
from sklearn.metrics import roc_curve
# ROC curves for train / validation / test from the probability scores
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_preds)
auc_train = roc_auc_score(y_train, y_train_preds)
fpr_valid, tpr_valid, thresholds_valid = roc_curve(y_valid, y_valid_preds)
auc_valid = roc_auc_score(y_valid, y_valid_preds)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_preds)
auc_test = roc_auc_score(y_test, y_test_preds)
plt.plot(fpr_train, tpr_train, 'r-',label ='Train AUC:%.3f'%auc_train)
plt.plot(fpr_valid, tpr_valid, 'b-',label ='Valid AUC:%.3f'%auc_valid)
plt.plot(fpr_test, tpr_test, 'g-',label ='Test AUC:%.3f'%auc_test)
# diagonal = random-classifier reference line
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()